started: AL26Apr2019
last updated: AL17Sep2019
Input sequencing data: 1,838 vars x 712 samples ( 515BC = 258UBC + 257CBC and 197NFE)
Input eigenvectors for 712 samples (calculated using 215 non-rare variants not in LD)
Add eigenvectors to phenotypes, make PCA plots and
Mark 26 PC outliers: outside of mean +/- 3*sd on the top 2 eigenvectors
Sys.time()
## [1] "2019-09-17 17:55:51 BST"
rm(list=ls())
graphics.off()
library(knitr)
## Warning: package 'knitr' was built under R version 3.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.2
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
base_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s13_ampliseq-nfe_only_PCA/s03_explore_PCA_plots_exclude_outliers"
opts_knit$set(root.dir = base_folder)
options(stringsAsFactors = F)
options(warnPartialMatchArgs = T,
warnPartialMatchAttr = T,
warnPartialMatchDollar = T)
#options(error = browser()) # Type Q or c to exit, drop browser level
# https://support.rstudio.com/hc/en-us/articles/200713843?version=1.1.456&mode=desktop
# https://stackoverflow.com/questions/13052522/how-to-leave-the-r-browser-mode-in-the-console-window/13052588
# Threshold for outlier detection
th <- 3 # mean +/- (th*sd)
# Sequencing data (for wecare phenotypes: case/control status)
source_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s11_remove_BRCA_PALB_carriers"
load(paste(source_folder, "s03_exclude_BRCA1_BCRA2_PALB2_carriers.RData", sep="/"))
base_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s13_ampliseq-nfe_only_PCA/s03_explore_PCA_plots_exclude_outliers"
# Eigenvectors & eigenvalues
source_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s13_ampliseq-nfe_only_PCA/s02_calculate_ampliseq_nfe_only_PCs/data/s04_pca"
eigenvectors_file <- paste(source_folder, "ampliseq_nfe_215_712_100PCs.eigenvec", sep="/")
eigenvalues_file <- paste(source_folder, "ampliseq_nfe_215_712_100PCs.eigenval", sep="/")
eigenvectors.df <- read.table(eigenvectors_file, header=T, sep="\t",quote="")
eigenvalues.df <- read.table(eigenvalues_file, header=F, sep="\t",quote="")
# Clean-up
rm(source_folder, eigenvectors_file, eigenvalues_file)
# List of objects
ls()
## [1] "base_folder" "eigenvalues.df" "eigenvectors.df" "genotypes.mx" "phenotypes.df" "th" "variants.df"
# Dimentions of objects
dim(eigenvectors.df)
## [1] 712 102
dim(eigenvalues.df)
## [1] 100 1
dim(genotypes.mx)
## [1] 1838 712
dim(phenotypes.df)
## [1] 712 24
dim(variants.df)
## [1] 1838 65
# Counts of nfe/controls/cases
table(phenotypes.df$cc)
##
## -1 0 1
## 197 258 257
# Update eigenvectors
rownames(eigenvectors.df) <- eigenvectors.df$FID
eigenvectors.df <- eigenvectors.df[,-1]
"long_ids" -> colnames(eigenvectors.df)[1]
eigenvectors.df[1:5,1:5]
## long_ids PC1 PC2 PC3 PC4
## 100_S8_L007 100_S8_L007 -0.0247118 0.00577574 -0.01270700 0.00726611
## 101_S528_L008 101_S528_L008 -0.0154747 -0.00145424 -0.00755583 -0.03030170
## 102_S466_L008 102_S466_L008 0.0169168 0.07628740 0.00574448 0.01463160
## 103_S147_L007 103_S147_L007 0.1431820 0.03840790 -0.03832170 -0.04580630
## 104_S325_L008 104_S325_L008 0.0129566 -0.02575130 -0.07195110 -0.02135050
plot(eigenvalues.df$V1, type="b",
xlab="Eigenvector", ylab="Eigenvalue",
ylim=c(0,20),main="Top 100 eigenvectors")
plot(eigenvalues.df$V1[1:20], type="b",
xlab="Eigenvector", ylab="Eigenvalue",
ylim=c(0,20), main="Top 20 eigenvectors")
rm(eigenvalues.df)
# Add factor column with verbal lables for phenotypes to be used in plot
group <- factor(phenotypes.df$cc, levels = c(0,1,-1), labels = c("UBC","CBC","NFE"))
phenotypes.df <- data.frame(phenotypes.df,group)
table(phenotypes.df$group)
##
## UBC CBC NFE
## 258 257 197
# Add column with sample IDs for the plot
sample_id <- phenotypes.df$Sample_num
sample_id[phenotypes.df$group=="NFE"] <- phenotypes.df$long_ids[phenotypes.df$group=="NFE"]
phenotypes.df <- data.frame(phenotypes.df,sample_id)
head(phenotypes.df$sample_id)
## [1] "100" "101" "102" "103" "104" "105"
tail(phenotypes.df$sample_id)
## [1] "NA20819" "NA20821" "NA20822" "NA20826" "NA20828" "NA20832"
# Add 5 top eigenvectors to phenotypes
phenotypes.df <- full_join(phenotypes.df,eigenvectors.df[,1:6], by="long_ids")
rownames(phenotypes.df) <- phenotypes.df$long_ids
# Prepare colour scale for ggplots
myColours <- c("NFE"="green", "UBC"="blue", "CBC"="red")
myColourScale <- scale_colour_manual(values=myColours)
# Clean-up
rm(group, sample_id, eigenvectors.df, myColours)
pc1_mean <- mean(phenotypes.df$PC1)
pc1_sd <- sd(phenotypes.df$PC1)
pc1_min <- pc1_mean - pc1_sd * th
pc1_max <- pc1_mean + pc1_sd * th
pc2_mean <- mean(phenotypes.df$PC2)
pc2_sd <- sd(phenotypes.df$PC2)
pc2_min <- pc2_mean - pc2_sd * th
pc2_max <- pc2_mean + pc2_sd * th
pc3_mean <- mean(phenotypes.df$PC3)
pc3_sd <- sd(phenotypes.df$PC3)
pc3_min <- pc3_mean - pc3_sd * th
pc3_max <- pc3_mean + pc3_sd * th
pc4_mean <- mean(phenotypes.df$PC4)
pc4_sd <- sd(phenotypes.df$PC4)
pc4_min <- pc4_mean - pc4_sd * th
pc4_max <- pc4_mean + pc4_sd * th
pc5_mean <- mean(phenotypes.df$PC5)
pc5_sd <- sd(phenotypes.df$PC5)
pc5_min <- pc5_mean - pc5_sd * th
pc5_max <- pc5_mean + pc5_sd * th
rm(pc1_mean, pc1_sd,
pc2_mean, pc2_sd,
pc3_mean, pc3_sd,
pc4_mean, pc4_sd,
pc5_mean, pc5_sd,
th)
# Detect outliers on each PC
pc1_outlier <- phenotypes.df$PC1 < pc1_min | phenotypes.df$PC1 > pc1_max
pc2_outlier <- phenotypes.df$PC2 < pc2_min | phenotypes.df$PC2 > pc2_max
pc3_outlier <- phenotypes.df$PC3 < pc3_min | phenotypes.df$PC3 > pc3_max
pc4_outlier <- phenotypes.df$PC4 < pc4_min | phenotypes.df$PC4 > pc4_max
pc5_outlier <- phenotypes.df$PC5 < pc5_min | phenotypes.df$PC5 > pc5_max
# Detect outliers on any PC
pc_outlier <- pc1_outlier | pc2_outlier
#pc_outlier <- pc1_outlier | pc2_outlier | pc3_outlier | pc4_outlier | pc5_outlier
# Check counts of outliers
sum(pc1_outlier)
## [1] 18
sum(pc2_outlier)
## [1] 9
sum(pc3_outlier)
## [1] 3
sum(pc4_outlier)
## [1] 7
sum(pc5_outlier)
## [1] 3
sum(pc_outlier)
## [1] 26
# Add outliers data to phenotypes dataframe
phenotypes.df <- data.frame(phenotypes.df, pc_outlier, pc1_outlier, pc2_outlier,
pc3_outlier, pc4_outlier, pc5_outlier)
phenotypes.df %>%
filter(pc_outlier) %>%
select(sample_id, pc_outlier, pc1_outlier, pc2_outlier,
pc3_outlier, pc4_outlier, pc5_outlier) %>%
arrange(as.integer(sample_id))
## sample_id pc_outlier pc1_outlier pc2_outlier pc3_outlier pc4_outlier pc5_outlier
## 1 3 TRUE TRUE FALSE FALSE FALSE FALSE
## 2 16 TRUE TRUE FALSE FALSE FALSE FALSE
## 3 103 TRUE TRUE FALSE FALSE FALSE TRUE
## 4 141 TRUE TRUE TRUE FALSE FALSE FALSE
## 5 148 TRUE TRUE FALSE FALSE FALSE FALSE
## 6 235 TRUE FALSE TRUE FALSE FALSE FALSE
## 7 238 TRUE FALSE TRUE FALSE FALSE FALSE
## 8 246 TRUE TRUE FALSE FALSE FALSE FALSE
## 9 256 TRUE FALSE TRUE FALSE FALSE FALSE
## 10 267 TRUE TRUE FALSE FALSE FALSE FALSE
## 11 273 TRUE TRUE FALSE FALSE FALSE FALSE
## 12 275 TRUE TRUE FALSE FALSE FALSE FALSE
## 13 277 TRUE TRUE FALSE FALSE FALSE FALSE
## 14 287 TRUE FALSE TRUE FALSE FALSE FALSE
## 15 293 TRUE FALSE TRUE FALSE FALSE FALSE
## 16 308 TRUE FALSE TRUE FALSE FALSE FALSE
## 17 326 TRUE FALSE TRUE FALSE FALSE FALSE
## 18 329 TRUE TRUE FALSE FALSE FALSE FALSE
## 19 347 TRUE TRUE FALSE FALSE FALSE FALSE
## 20 366 TRUE TRUE FALSE FALSE FALSE FALSE
## 21 368 TRUE TRUE FALSE FALSE FALSE FALSE
## 22 369 TRUE FALSE TRUE FALSE FALSE FALSE
## 23 385 TRUE TRUE FALSE FALSE FALSE FALSE
## 24 388 TRUE TRUE FALSE FALSE FALSE FALSE
## 25 403 TRUE TRUE FALSE FALSE FALSE FALSE
## 26 408 TRUE TRUE FALSE FALSE FALSE FALSE
# Clean-up
rm(pc_outlier, pc1_outlier, pc2_outlier, pc3_outlier, pc4_outlier, pc5_outlier)
# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC1,PC2)) +
geom_point(aes(col=group, text=sample_id)) +
labs(title="PC1 vs PC2", x="PC1", y="PC2") +
theme(legend.title=element_blank()) + # To suppress the legend title
myColourScale + # otherwise it would be "group"
geom_vline(xintercept=c(pc1_min, pc1_max), linetype="dotted", size=0.2) +
geom_hline(yintercept=c(pc2_min, pc2_max), linetype="dotted", size=0.2)
## Warning: Ignoring unknown aesthetics: text
# Draw static ggplot
#g
# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g)
# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC2,PC3)) +
geom_point(aes(col=group, text=sample_id)) +
labs(title="PC2 vs PC3", x="PC2", y="PC3") +
theme(legend.title=element_blank()) +
myColourScale +
geom_vline(xintercept=c(pc2_min, pc2_max), linetype="dotted", size=0.2)
## Warning: Ignoring unknown aesthetics: text
# geom_hline(yintercept=c(pc3_min, pc3_max), linetype="dotted", size=0.2)
# Draw static ggplot
#g
# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g)
# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC3,PC4)) +
geom_point(aes(col=group, text=sample_id)) +
labs(title="PC3 vs PC4", x="PC3", y="PC4") +
theme(legend.title=element_blank()) +
myColourScale
## Warning: Ignoring unknown aesthetics: text
# geom_vline(xintercept=c(pc3_min, pc3_max), linetype="dotted", size=0.2) +
# geom_hline(yintercept=c(pc4_min, pc4_max), linetype="dotted", size=0.2)
# Draw static ggplot
#g
# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g)
# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC4,PC5)) +
geom_point(aes(col=group, text=sample_id)) +
labs(title="PC4 vs PC5", x="PC4", y="PC5") +
theme(legend.title=element_blank()) +
myColourScale
## Warning: Ignoring unknown aesthetics: text
# geom_vline(xintercept=c(pc4_min, pc4_max), linetype="dotted", size=0.2) +
# geom_hline(yintercept=c(pc5_min, pc5_max), linetype="dotted", size=0.2)
# Draw static ggplot
#g
# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g, myColourScale,
pc1_min, pc1_max, pc2_min, pc2_max, pc3_min, pc3_max, pc4_min, pc4_max, pc5_min, pc5_max)
save.image(paste(base_folder, "s01_add_PCs.RData", sep="/"))
ls()
## [1] "base_folder" "genotypes.mx" "phenotypes.df" "variants.df"
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] plotly_4.9.0 ggplot2_3.2.0 dplyr_0.8.1 knitr_1.23
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.1 later_0.8.0 pillar_1.4.1 compiler_3.5.1 tools_3.5.1 digest_0.6.19 jsonlite_1.6 evaluate_0.14 tibble_2.1.3 gtable_0.3.0 viridisLite_0.3.0 pkgconfig_2.0.2 rlang_0.3.4 shiny_1.3.2 crosstalk_1.0.0 yaml_2.2.0 xfun_0.7 withr_2.1.2 stringr_1.4.0 httr_1.4.0 htmlwidgets_1.3 grid_3.5.1 tidyselect_0.2.5 glue_1.3.1 data.table_1.12.2 R6_2.4.0 rmarkdown_1.13 purrr_0.3.2 tidyr_0.8.3 magrittr_1.5 promises_1.0.1 scales_1.0.0 htmltools_0.3.6 assertthat_0.2.1 xtable_1.8-4 mime_0.7 colorspace_1.4-1 httpuv_1.5.1 labeling_0.3 stringi_1.4.3 lazyeval_0.2.2 munsell_0.5.0 crayon_1.3.4
Sys.time()
## [1] "2019-09-17 17:55:54 BST"